1 Intalling Required Packages

packages = c('rgdal', 'spdep', 'tmap', 'sf', 'ggpubr', 'cluster', 'factoextra', 'NbClust', 'heatmaply', 'corrplot', 'psych', 'tidyverse')
for (p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p,character.only = T)
}
## Warning: package 'rgdal' was built under R version 4.0.3
## Warning: package 'sp' was built under R version 4.0.3
## Warning: package 'sf' was built under R version 4.0.3
## Warning: package 'factoextra' was built under R version 4.0.3
## Warning: package 'NbClust' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.3
## Warning: package 'readr' was built under R version 4.0.3

2 Data Import and Preparation

2.1 Importing Aspatial and Spatial Data

corp_info_merged <- read_csv("data/aspatial/corp_info_merged.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   uen = col_character(),
##   registration_year = col_double(),
##   primary_ssic_code = col_character(),
##   postal_code = col_double(),
##   category = col_character(),
##   primary_ssic_category_description = col_character(),
##   X_coord = col_double(),
##   Y_coord = col_double()
## )
corp_info_merged <- st_as_sf(corp_info_merged, coords = c('X_coord','Y_coord'), crs = 3414)
ssic2020 <- read_csv("data/aspatial/ssic2020.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   `SSIC 2020` = col_character(),
##   category = col_character(),
##   primary_ssic_code = col_character()
## )
postal_code_geom <- read_csv("data/aspatial/postal_code_geom.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   postal_code = col_double(),
##   X_coord = col_double(),
##   Y_coord = col_double()
## )
mpsz <- st_read(dsn = "data/geospatial", layer="MP14_SUBZONE_WEB_PL") %>%
  filter(!(PLN_AREA_N %in% c("NORTH-EASTERN ISLANDS",
                             "CENTRAL WATER CATCHMENT",
                             "CHANGI BAY",
                             "MARINA SOUTH",
                             "SIMPANG",
                             "SOUTHERN ISLANDS",
                             "STRAITS VIEW",
                             "TENGAH")))
## Reading layer `MP14_SUBZONE_WEB_PL' from data source `D:\GitHub\is415-proj\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 323 features and 15 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
## projected CRS:  SVY21

2.2 Checking the CRS for Spatial Data

mpsz
## Simple feature collection with 311 features and 15 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 2667.538 ymin: 15748.72 xmax: 50293.96 ymax: 50256.33
## projected CRS:  SVY21
## First 10 features:
##    OBJECTID SUBZONE_NO       SUBZONE_N SUBZONE_C CA_IND      PLN_AREA_N
## 1         2          1    PEARL'S HILL    OTSZ01      Y          OUTRAM
## 2         3          3       BOAT QUAY    SRSZ03      Y SINGAPORE RIVER
## 3         4          8  HENDERSON HILL    BMSZ08      N     BUKIT MERAH
## 4         5          3         REDHILL    BMSZ03      N     BUKIT MERAH
## 5         6          7  ALEXANDRA HILL    BMSZ07      N     BUKIT MERAH
## 6         7          9   BUKIT HO SWEE    BMSZ09      N     BUKIT MERAH
## 7         8          2     CLARKE QUAY    SRSZ02      Y SINGAPORE RIVER
## 8         9         13 PASIR PANJANG 1    QTSZ13      N      QUEENSTOWN
## 9        10          7       QUEENSWAY    QTSZ07      N      QUEENSTOWN
## 10       11         12      KENT RIDGE    QTSZ12      N      QUEENSTOWN
##    PLN_AREA_C       REGION_N REGION_C          INC_CRC FMEL_UPD_D   X_ADDR
## 1          OT CENTRAL REGION       CR 8C7149B9EB32EEFC 2014-12-05 28679.06
## 2          SR CENTRAL REGION       CR C35FEFF02B13E0E5 2014-12-05 29654.96
## 3          BM CENTRAL REGION       CR 3775D82C5DDBEFBD 2014-12-05 26782.83
## 4          BM CENTRAL REGION       CR 85D9ABEF0A40678F 2014-12-05 26201.96
## 5          BM CENTRAL REGION       CR 9D286521EF5E3B59 2014-12-05 25358.82
## 6          BM CENTRAL REGION       CR 7839A8577144EFE2 2014-12-05 27680.06
## 7          SR CENTRAL REGION       CR 48661DC0FBA09F7A 2014-12-05 29253.21
## 8          QT CENTRAL REGION       CR 1F721290C421BFAB 2014-12-05 22077.34
## 9          QT CENTRAL REGION       CR 3580D2AFFBEE914C 2014-12-05 24168.31
## 10         QT CENTRAL REGION       CR 601BA309A1AAC731 2014-12-05 23464.84
##      Y_ADDR SHAPE_Leng SHAPE_Area                       geometry
## 1  29782.05   3506.107   559816.2 MULTIPOLYGON (((29092.28 30...
## 2  29974.66   1740.926   160807.5 MULTIPOLYGON (((29932.33 29...
## 3  29933.77   3313.625   595428.9 MULTIPOLYGON (((27131.28 30...
## 4  30005.70   2825.594   387429.4 MULTIPOLYGON (((26451.03 30...
## 5  29991.38   4428.913  1030378.8 MULTIPOLYGON (((25899.7 297...
## 6  30230.86   3275.312   551732.0 MULTIPOLYGON (((27746.95 30...
## 7  30222.86   2208.619   290184.7 MULTIPOLYGON (((29351.26 29...
## 8  29893.78   6571.323  1084792.3 MULTIPOLYGON (((20996.49 30...
## 9  30104.18   3454.239   631644.3 MULTIPOLYGON (((24472.11 29...
## 10 29725.37   7439.548  1826848.6 MULTIPOLYGON (((23332.77 30...
st_crs(mpsz)
## Coordinate Reference System:
##   User input: SVY21 
##   wkt:
## PROJCRS["SVY21",
##     BASEGEOGCRS["SVY21[WGS84]",
##         DATUM["World Geodetic System 1984",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]],
##             ID["EPSG",6326]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["Degree",0.0174532925199433]]],
##     CONVERSION["unnamed",
##         METHOD["Transverse Mercator",
##             ID["EPSG",9807]],
##         PARAMETER["Latitude of natural origin",1.36666666666667,
##             ANGLEUNIT["Degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",103.833333333333,
##             ANGLEUNIT["Degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["Scale factor at natural origin",1,
##             SCALEUNIT["unity",1],
##             ID["EPSG",8805]],
##         PARAMETER["False easting",28001.642,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",38744.572,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["(E)",east,
##             ORDER[1],
##             LENGTHUNIT["metre",1,
##                 ID["EPSG",9001]]],
##         AXIS["(N)",north,
##             ORDER[2],
##             LENGTHUNIT["metre",1,
##                 ID["EPSG",9001]]]]

2.3 Transforming the CRS for Spatial Data

mpsz_3414 <- st_transform(mpsz, 3414)
st_crs(mpsz_3414)
## Coordinate Reference System:
##   User input: EPSG:3414 
##   wkt:
## PROJCRS["SVY21 / Singapore TM",
##     BASEGEOGCRS["SVY21",
##         DATUM["SVY21",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4757]],
##     CONVERSION["Singapore Transverse Mercator",
##         METHOD["Transverse Mercator",
##             ID["EPSG",9807]],
##         PARAMETER["Latitude of natural origin",1.36666666666667,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",103.833333333333,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["Scale factor at natural origin",1,
##             SCALEUNIT["unity",1],
##             ID["EPSG",8805]],
##         PARAMETER["False easting",28001.642,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",38744.572,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["northing (N)",north,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["easting (E)",east,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["Singapore"],
##         BBOX[1.13,103.59,1.47,104.07]],
##     ID["EPSG",3414]]

2.4 Converting to Spatial or Spatial* Equivalents

corp_info_merged_sp <- as(corp_info_merged, "Spatial")
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum Unknown based on WGS84 ellipsoid in CRS definition
## Warning in showSRID(SRS_string, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum SVY21 in CRS definition
corp_info_merged_sp <- as(corp_info_merged_sp, "SpatialPoints")
mpsz_sp <- as(mpsz_3414, "Spatial")
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum Unknown based on WGS84 ellipsoid in CRS definition
## Warning in showSRID(SRS_string, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum SVY21 in CRS definition
plot(mpsz_sp, border="darkgrey") +
plot(corp_info_merged_sp, add=TRUE)

## integer(0)
unique(corp_info_merged$category)
##  [1] "G" "F" "H" "C" "N" "I" "S" "M" "Q" "L" "J" "R" "P" "E" "K" "D" "A" "O"
for (category_id in unique(corp_info_merged$category)) {
  corp_with_category <- corp_info_merged %>%
    filter(category == category_id)
  mpsz_3414[, paste0("Category ", category_id)]<- lengths(st_intersects(mpsz_3414, corp_with_category))
}
mpsz_3414 <- mpsz_3414 %>%
  mutate(Total = rowSums(across("Category G":"Category O")))
print(mean(mpsz_3414$Total))
## [1] 362.1286
mpsz_3414 <- mpsz_3414 %>%
  mutate(`Cat G Prop` = case_when(Total != 0 ~ `Category G`/Total * 1000,
                                             Total == 0 ~ 0)) %>%
  mutate(`Cat F Prop` = case_when(Total != 0 ~ `Category F`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat H Prop` = case_when(Total != 0 ~ `Category H`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat C Prop` = case_when(Total != 0 ~ `Category C`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat N Prop` = case_when(Total != 0 ~ `Category N`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat I Prop` = case_when(Total != 0 ~ `Category I`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat S Prop` = case_when(Total != 0 ~ `Category S`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat M Prop` = case_when(Total != 0 ~ `Category M`/Total * 1000,
                                       Total == 0 ~ 0)) %>%
  mutate(`Cat Q Prop` = case_when(Total != 0 ~ `Category Q`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat L Prop` = case_when(Total != 0 ~ `Category L`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat J Prop` = case_when(Total != 0 ~ `Category J`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat R Prop` = case_when(Total != 0 ~ `Category R`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat P Prop` = case_when(Total != 0 ~ `Category P`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat E Prop` = case_when(Total != 0 ~ `Category E`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat K Prop` = case_when(Total != 0 ~ `Category K`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat D Prop` = case_when(Total != 0 ~ `Category D`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat A Prop` = case_when(Total != 0 ~ `Category A`/Total * 1000,
                                         Total == 0 ~ 0)) %>%
  mutate(`Cat O Prop` = case_when(Total != 0 ~ `Category O`/Total * 1000,
                                         Total == 0 ~ 0))

3 Exploratory Data Analysis

3.1 Visualisation of Statistics

ggplot(data=mpsz_3414, aes(x=`Category M`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

ggplot(data=mpsz_3414, aes(x=`Category M`)) +
  geom_boxplot(color="black", fill="light blue")

ggplot(data=mpsz_3414, aes(x=`Cat M Prop`)) +
  geom_histogram(bins=20, color="black", fill="light blue")

ggplot(data=mpsz_3414, aes(x=`Cat M Prop`)) +
  geom_boxplot(color="black", fill="light blue")

cat_g <- ggplot(data=mpsz_3414, 
             aes(x= `Cat G Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_f <- ggplot(data=mpsz_3414, 
             aes(x= `Cat F Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_h <- ggplot(data=mpsz_3414, 
             aes(x= `Cat H Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_c <- ggplot(data=mpsz_3414, 
             aes(x= `Cat C Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_n <- ggplot(data=mpsz_3414, 
             aes(x= `Cat N Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_i <- ggplot(data=mpsz_3414, 
             aes(x= `Cat I Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")
ggarrange(cat_g, cat_f, cat_h, cat_c, cat_n, cat_i, 
          ncol = 3, 
          nrow = 2)

cat_s <- ggplot(data=mpsz_3414, 
             aes(x= `Cat S Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_m <- ggplot(data=mpsz_3414, 
             aes(x= `Cat M Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_q <- ggplot(data=mpsz_3414, 
             aes(x= `Cat Q Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_l <- ggplot(data=mpsz_3414, 
             aes(x= `Cat L Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_j <- ggplot(data=mpsz_3414, 
             aes(x= `Cat J Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_r <- ggplot(data=mpsz_3414, 
             aes(x= `Cat R Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")
ggarrange(cat_s, cat_m, cat_q, cat_l, cat_j, cat_r, 
          ncol = 3, 
          nrow = 2)

cat_p <- ggplot(data=mpsz_3414, 
             aes(x= `Cat P Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_e <- ggplot(data=mpsz_3414, 
             aes(x= `Cat E Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_k <- ggplot(data=mpsz_3414, 
             aes(x= `Cat K Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_d <- ggplot(data=mpsz_3414, 
             aes(x= `Cat D Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_a <- ggplot(data=mpsz_3414, 
             aes(x= `Cat A Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

cat_o <- ggplot(data=mpsz_3414, 
             aes(x= `Cat O Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")
ggarrange(cat_p, cat_e, cat_k, cat_d, cat_a, cat_o, 
          ncol = 3, 
          nrow = 2)

3.2 Choropleths

qtm(mpsz_3414, "Cat H Prop")

qtm(mpsz_3414, "Total")

3.3 Correlation Analysis

mpsz_3414_derived <- st_drop_geometry(mpsz_3414)
clustering_data <- mpsz_3414_derived[,35:52]
cluster_vars.cor = cor(clustering_data)
corrplot.mixed(cluster_vars.cor,
         lower = "ellipse", 
               upper = "number",
               tl.pos = "lt",
               diag = "l",
               tl.col = "black")

4 Extracting Clustering Variables

cluster_vars <- mpsz_3414_derived %>%
  select("SUBZONE_N", ends_with("Prop"), -`Cat I Prop`, -`Cat L Prop`, -`Cat H Prop`, -`Cat P Prop`, -`Cat N Prop`, -`Cat J Prop`, -`Cat M Prop`, -`Cat F Prop`)
head(cluster_vars,10)
##          SUBZONE_N Cat G Prop Cat C Prop Cat S Prop Cat Q Prop Cat R Prop
## 1     PEARL'S HILL  232.77910   24.94062   20.19002  20.190024   9.501188
## 2        BOAT QUAY    0.00000    0.00000    0.00000   0.000000   0.000000
## 3   HENDERSON HILL  316.32653   10.20408   30.61224  40.816327  30.612245
## 4          REDHILL  200.00000    0.00000   75.00000  16.666667  16.666667
## 5   ALEXANDRA HILL  221.12211   36.30363   51.70517  11.001100   6.600660
## 6    BUKIT HO SWEE  230.28391   22.08202   59.93691  34.700315  50.473186
## 7      CLARKE QUAY    0.00000    0.00000   40.00000   0.000000 400.000000
## 8  PASIR PANJANG 1  204.08163   35.71429   51.02041  30.612245  56.122449
## 9        QUEENSWAY  555.55556   15.87302   79.36508  31.746032  47.619048
## 10      KENT RIDGE   58.82353   78.43137    0.00000   9.803922   0.000000
##    Cat E Prop Cat K Prop Cat D Prop Cat A Prop Cat O Prop
## 1    0.000000  140.14252   1.187648   2.375297   0.000000
## 2    0.000000    0.00000   0.000000   0.000000   0.000000
## 3   10.204082   30.61224   0.000000  10.204082   0.000000
## 4    0.000000   75.00000   0.000000   0.000000   0.000000
## 5    0.000000  104.51045   2.200220   7.700770   0.000000
## 6    0.000000   66.24606   0.000000   6.309148   0.000000
## 7    0.000000   40.00000   0.000000   0.000000   0.000000
## 8    5.102041   56.12245   0.000000   0.000000   5.102041
## 9    0.000000   15.87302   0.000000   0.000000   0.000000
## 10   0.000000  147.05882   0.000000  19.607843   0.000000
row.names(cluster_vars) <- cluster_vars$`SUBZONE_N`
head(cluster_vars,10)
##                       SUBZONE_N Cat G Prop Cat C Prop Cat S Prop Cat Q Prop
## PEARL'S HILL       PEARL'S HILL  232.77910   24.94062   20.19002  20.190024
## BOAT QUAY             BOAT QUAY    0.00000    0.00000    0.00000   0.000000
## HENDERSON HILL   HENDERSON HILL  316.32653   10.20408   30.61224  40.816327
## REDHILL                 REDHILL  200.00000    0.00000   75.00000  16.666667
## ALEXANDRA HILL   ALEXANDRA HILL  221.12211   36.30363   51.70517  11.001100
## BUKIT HO SWEE     BUKIT HO SWEE  230.28391   22.08202   59.93691  34.700315
## CLARKE QUAY         CLARKE QUAY    0.00000    0.00000   40.00000   0.000000
## PASIR PANJANG 1 PASIR PANJANG 1  204.08163   35.71429   51.02041  30.612245
## QUEENSWAY             QUEENSWAY  555.55556   15.87302   79.36508  31.746032
## KENT RIDGE           KENT RIDGE   58.82353   78.43137    0.00000   9.803922
##                 Cat R Prop Cat E Prop Cat K Prop Cat D Prop Cat A Prop
## PEARL'S HILL      9.501188   0.000000  140.14252   1.187648   2.375297
## BOAT QUAY         0.000000   0.000000    0.00000   0.000000   0.000000
## HENDERSON HILL   30.612245  10.204082   30.61224   0.000000  10.204082
## REDHILL          16.666667   0.000000   75.00000   0.000000   0.000000
## ALEXANDRA HILL    6.600660   0.000000  104.51045   2.200220   7.700770
## BUKIT HO SWEE    50.473186   0.000000   66.24606   0.000000   6.309148
## CLARKE QUAY     400.000000   0.000000   40.00000   0.000000   0.000000
## PASIR PANJANG 1  56.122449   5.102041   56.12245   0.000000   0.000000
## QUEENSWAY        47.619048   0.000000   15.87302   0.000000   0.000000
## KENT RIDGE        0.000000   0.000000  147.05882   0.000000  19.607843
##                 Cat O Prop
## PEARL'S HILL      0.000000
## BOAT QUAY         0.000000
## HENDERSON HILL    0.000000
## REDHILL           0.000000
## ALEXANDRA HILL    0.000000
## BUKIT HO SWEE     0.000000
## CLARKE QUAY       0.000000
## PASIR PANJANG 1   5.102041
## QUEENSWAY         0.000000
## KENT RIDGE        0.000000
sg_business <- select(cluster_vars, c(2:11))
row.names(sg_business) <- cluster_vars$`SUBZONE_N`
head(sg_business, 10)
##                 Cat G Prop Cat C Prop Cat S Prop Cat Q Prop Cat R Prop
## PEARL'S HILL     232.77910   24.94062   20.19002  20.190024   9.501188
## BOAT QUAY          0.00000    0.00000    0.00000   0.000000   0.000000
## HENDERSON HILL   316.32653   10.20408   30.61224  40.816327  30.612245
## REDHILL          200.00000    0.00000   75.00000  16.666667  16.666667
## ALEXANDRA HILL   221.12211   36.30363   51.70517  11.001100   6.600660
## BUKIT HO SWEE    230.28391   22.08202   59.93691  34.700315  50.473186
## CLARKE QUAY        0.00000    0.00000   40.00000   0.000000 400.000000
## PASIR PANJANG 1  204.08163   35.71429   51.02041  30.612245  56.122449
## QUEENSWAY        555.55556   15.87302   79.36508  31.746032  47.619048
## KENT RIDGE        58.82353   78.43137    0.00000   9.803922   0.000000
##                 Cat E Prop Cat K Prop Cat D Prop Cat A Prop Cat O Prop
## PEARL'S HILL      0.000000  140.14252   1.187648   2.375297   0.000000
## BOAT QUAY         0.000000    0.00000   0.000000   0.000000   0.000000
## HENDERSON HILL   10.204082   30.61224   0.000000  10.204082   0.000000
## REDHILL           0.000000   75.00000   0.000000   0.000000   0.000000
## ALEXANDRA HILL    0.000000  104.51045   2.200220   7.700770   0.000000
## BUKIT HO SWEE     0.000000   66.24606   0.000000   6.309148   0.000000
## CLARKE QUAY       0.000000   40.00000   0.000000   0.000000   0.000000
## PASIR PANJANG 1   5.102041   56.12245   0.000000   0.000000   5.102041
## QUEENSWAY         0.000000   15.87302   0.000000   0.000000   0.000000
## KENT RIDGE        0.000000  147.05882   0.000000  19.607843   0.000000

5 Data Standardisation

5.1 Min-max Standardisation

sg_business.minmax <- normalize(sg_business)
summary(sg_business.minmax)
##    Cat G Prop       Cat C Prop        Cat S Prop        Cat Q Prop     
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.3298   1st Qu.:0.01735   1st Qu.:0.01758   1st Qu.:0.00000  
##  Median :0.4500   Median :0.03810   Median :0.04400   Median :0.01569  
##  Mean   :0.3944   Mean   :0.05751   Mean   :0.05124   Mean   :0.02767  
##  3rd Qu.:0.5027   3rd Qu.:0.05700   3rd Qu.:0.06573   3rd Qu.:0.03130  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##    Cat R Prop        Cat E Prop        Cat K Prop        Cat D Prop     
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.03558   1st Qu.:0.00000  
##  Median :0.02358   Median :0.00000   Median :0.09278   Median :0.00000  
##  Mean   :0.03392   Mean   :0.04249   Mean   :0.13440   Mean   :0.01169  
##  3rd Qu.:0.04049   3rd Qu.:0.03068   3rd Qu.:0.16699   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##    Cat A Prop        Cat O Prop     
##  Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000  
##  Mean   :0.01913   Mean   :0.01173  
##  3rd Qu.:0.01499   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000

5.2 Z-score Standardisation

sg_business.zscore <- scale(sg_business)
summary(sg_business.zscore)
##    Cat G Prop        Cat C Prop          Cat S Prop        Cat Q Prop      
##  Min.   :-1.9781   Min.   :-0.667310   Min.   :-0.6950   Min.   :-0.33548  
##  1st Qu.:-0.3239   1st Qu.:-0.465962   1st Qu.:-0.4565   1st Qu.:-0.33548  
##  Median : 0.2788   Median :-0.225306   Median :-0.0982   Median :-0.14531  
##  Mean   : 0.0000   Mean   : 0.000000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.5429   3rd Qu.:-0.005913   3rd Qu.: 0.1966   3rd Qu.: 0.04402  
##  Max.   : 3.0373   Max.   :10.935294   Max.   :12.8682   Max.   :11.78784  
##    Cat R Prop         Cat E Prop        Cat K Prop        Cat D Prop     
##  Min.   :-0.50735   Min.   :-0.3607   Min.   :-0.8167   Min.   :-0.1847  
##  1st Qu.:-0.50735   1st Qu.:-0.3607   1st Qu.:-0.6005   1st Qu.:-0.1847  
##  Median :-0.15457   Median :-0.3607   Median :-0.2529   Median :-0.1847  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.09824   3rd Qu.:-0.1002   3rd Qu.: 0.1981   3rd Qu.:-0.1847  
##  Max.   :14.45027   Max.   : 8.1280   Max.   : 5.2600   Max.   :15.6156  
##    Cat A Prop         Cat O Prop     
##  Min.   :-0.24228   Min.   :-0.1296  
##  1st Qu.:-0.24228   1st Qu.:-0.1296  
##  Median :-0.24228   Median :-0.1296  
##  Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.:-0.05243   3rd Qu.:-0.1296  
##  Max.   :12.41937   Max.   :10.9190

5.3 Visualising the standardised clustering variables

r <- ggplot(data=mpsz_3414_derived, 
             aes(x= `Cat G Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue")

sg_business_minmax_df <- as.data.frame(sg_business.minmax)
s <- ggplot(data=sg_business_minmax_df, 
       aes(x=`Cat G Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue") +
  ggtitle("Min-Max Standardisation")

sg_business_zscore_df <- as.data.frame(sg_business.zscore)
z <- ggplot(data=sg_business_zscore_df, 
       aes(x=`Cat G Prop`)) +
  geom_histogram(bins=20, 
                 color="black", 
                 fill="light blue") +
  ggtitle("Z-score Standardisation")

ggarrange(r, s, z,
          ncol = 3,
          nrow = 1)

6 Hierarchical Cluster Analysis

6.1 Computing Proximity Matrix

proxmat <- dist(sg_business, method = 'euclidean')

6.2 Computing Hierarchical Clustering

hclust_ward <- hclust(proxmat, method = 'ward.D')
plot(hclust_ward, cex = 0.6)

6.3 Selecting Optimal Clustering Algorithm

m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")

ac <- function(x) {
  agnes(sg_business, method = x)$ac
}

map_dbl(m, ac)
##   average    single  complete      ward 
## 0.9506336 0.9528419 0.9625796 0.9765662

With reference to the output above, we can see that Ward’s method provides the strongest clustering structure among the four methods assessed. Hence, in the subsequent analysis, only Ward’s method will be used.

6.4 Determining Optimal Clusters

set.seed(54321)
gap_stat <- clusGap(sg_business, FUN = hcut, nstart = 25, K.max = 25, B = 50)
# Print the result
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = sg_business, FUNcluster = hcut, K.max = 25, B = 50,     nstart = 25)
## B=50 simulated reference sets, k = 1..25; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 2
##           logW   E.logW      gap      SE.sim
##  [1,] 9.684116 11.05891 1.374792 0.009404075
##  [2,] 9.454766 10.95801 1.503248 0.014234872
##  [3,] 9.383786 10.88580 1.502018 0.013384406
##  [4,] 9.242075 10.83298 1.590905 0.013008459
##  [5,] 9.200741 10.79124 1.590501 0.013121701
##  [6,] 9.089472 10.75477 1.665294 0.013452089
##  [7,] 9.027474 10.72292 1.695445 0.013054442
##  [8,] 8.954053 10.69291 1.738861 0.012935398
##  [9,] 8.917038 10.66569 1.748655 0.012722686
## [10,] 8.875763 10.64031 1.764545 0.012340370
## [11,] 8.849896 10.61687 1.766976 0.012043180
## [12,] 8.827149 10.59440 1.767253 0.011937704
## [13,] 8.771298 10.57279 1.801489 0.011870034
## [14,] 8.726574 10.55274 1.826166 0.012276322
## [15,] 8.705511 10.53376 1.828245 0.012444394
## [16,] 8.683364 10.51549 1.832128 0.012348166
## [17,] 8.634498 10.49804 1.863547 0.012293103
## [18,] 8.608160 10.48126 1.873099 0.012298986
## [19,] 8.590848 10.46520 1.874352 0.012186732
## [20,] 8.551617 10.44988 1.898260 0.012106799
## [21,] 8.529918 10.43501 1.905088 0.011865180
## [22,] 8.512432 10.42046 1.908028 0.011846658
## [23,] 8.487536 10.40642 1.918884 0.011718945
## [24,] 8.469926 10.39259 1.922663 0.011585602
## [25,] 8.451780 10.37911 1.927334 0.011552423
fviz_gap_stat(gap_stat)

6.5 Interpreting the Dendrogram

plot(hclust_ward, cex = 0.6)
rect.hclust(hclust_ward, k = 10, border = 2:5)

6.6 Visually Driven Hierarchical Analysis

sg_business_mat <- data.matrix(sg_business)
heatmaply(normalize(sg_business_mat),
          Colv=NA,
          dist_method = "euclidean",
          hclust_method = "ward.D",
          seriate = "OLO",
          colors = Blues,
          k_row = 10,
          margins = c(NA,200,60,NA),
          fontsize_row = 4,
          fontsize_col = 5,
          main="Geographic Segmentation of Singapore by Business Prominence",
          xlab = "Business Prominence",
          ylab = "Singapore's Planning Areas"
          )

6.7 Mapping the Clusters Formed

groups <- as.factor(cutree(hclust_ward, k=10))
sg_biz_cluster <- cbind(mpsz_3414, as.matrix(groups)) %>%
  rename(`CLUSTER`=`as.matrix.groups.`)
qtm(sg_biz_cluster, "CLUSTER")

7 Spatially Constrained Clustering (SKATER)

7.1 Conversion to SpatialPolygonsDataFrame

mpsz_3414_sp <- as_Spatial(mpsz_3414)
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum Unknown based on WGS84 ellipsoid in CRS definition
## Warning in showSRID(SRS_string, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum SVY21 in CRS definition

7.2 Computing Neighbour List

sg.nb <- poly2nb(mpsz_3414_sp)
summary(sg.nb)
## Neighbour list object:
## Number of regions: 311 
## Number of nonzero links: 1838 
## Percentage nonzero weights: 1.900311 
## Average number of links: 5.909968 
## 2 regions with no links:
## 16 17
## Link number distribution:
## 
##  0  1  2  3  4  5  6  7  8  9 10 11 13 
##  2  1  4 13 32 78 80 48 32 16  1  3  1 
## 1 least connected region:
## 15 with 1 link
## 1 most connected region:
## 279 with 13 links
mpsz_3414_sp$SUBZONE_N[16]
## [1] "SUDONG"
mpsz_3414_sp$SUBZONE_N[17]
## [1] "SEMAKAU"
mpsz_3414_sp <- mpsz_3414_sp[!mpsz_3414$SUBZONE_N %in% c("SUDONG", "SEMAKAU"),]
sg.nb <- poly2nb(mpsz_3414_sp)
summary(sg.nb)
## Neighbour list object:
## Number of regions: 309 
## Number of nonzero links: 1838 
## Percentage nonzero weights: 1.92499 
## Average number of links: 5.94822 
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8  9 10 11 13 
##  1  4 13 32 78 80 48 32 16  1  3  1 
## 1 least connected region:
## 15 with 1 link
## 1 most connected region:
## 279 with 13 links
plot(mpsz_3414_sp, border=grey(.5))
plot(sg.nb, coordinates(mpsz_3414_sp), col="blue", add=TRUE)

7.3 Computing Minimum Spanning Tree

7.3.1 Calculate Edge Cost

lcosts <- nbcosts(sg.nb, sg_business)
sg.w <- nb2listw(sg.nb, lcosts, style="B")
summary(sg.w)
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 309 
## Number of nonzero links: 1838 
## Percentage nonzero weights: 1.92499 
## Average number of links: 5.94822 
## Link number distribution:
## 
##  1  2  3  4  5  6  7  8  9 10 11 13 
##  1  4 13 32 78 80 48 32 16  1  3  1 
## 1 least connected region:
## 15 with 1 link
## 1 most connected region:
## 279 with 13 links
## 
## Weights style: B 
## Weights constants summary:
##     n    nn     S0        S1         S2
## B 309 95481 350807 250427155 2664023836

7.3.2 Creating Minimum Spanning Tree

sg.mst <- mstree(sg.w)
class(sg.mst)
## [1] "mst"    "matrix"
dim(sg.mst)
## [1] 308   3
head(sg.mst)
##      [,1] [,2]      [,3]
## [1,]    8   50  75.92565
## [2,]   50  122  44.42275
## [3,]  122   74  50.92590
## [4,]   50   24  57.06020
## [5,]    8   45 106.64555
## [6,]   45   72  82.89484
plot(mpsz_sp, border=gray(.5))
plot.mst(sg.mst, coordinates(mpsz_sp), 
     col="blue", cex.lab=0.7, cex.circles=0.005, add=TRUE)

skaterclust <- skater(sg.mst[,1:2], sg_business, method = "euclidean", 9)
str(skaterclust)
## List of 8
##  $ groups      : num [1:309] 1 2 1 1 1 1 2 1 1 3 ...
##  $ edges.groups:List of 10
##   ..$ :List of 3
##   .. ..$ node: num [1:268] 4 284 116 201 87 157 258 100 160 1 ...
##   .. ..$ edge: num [1:267, 1:3] 284 116 201 87 258 157 100 160 1 258 ...
##   .. ..$ ssw : num 29136
##   ..$ :List of 3
##   .. ..$ node: num [1:12] 12 39 37 32 2 21 20 7 56 33 ...
##   .. ..$ edge: num [1:11, 1:3] 2 32 20 37 21 39 32 32 32 39 ...
##   .. ..$ ssw : num 1443
##   ..$ :List of 3
##   .. ..$ node: num [1:12] 10 18 23 16 22 19 57 17 27 34 ...
##   .. ..$ edge: num [1:11, 1:3] 18 16 10 16 16 16 23 22 23 18 ...
##   .. ..$ ssw : num 1050
##   ..$ :List of 3
##   .. ..$ node: num 250
##   .. ..$ edge: num[0 , 1:3] 
##   .. ..$ ssw : num 0
##   ..$ :List of 3
##   .. ..$ node: num 30
##   .. ..$ edge: num[0 , 1:3] 
##   .. ..$ ssw : num 0
##   ..$ :List of 3
##   .. ..$ node: num 274
##   .. ..$ edge: num[0 , 1:3] 
##   .. ..$ ssw : num 0
##   ..$ :List of 3
##   .. ..$ node: num 175
##   .. ..$ edge: num[0 , 1:3] 
##   .. ..$ ssw : num 0
##   ..$ :List of 3
##   .. ..$ node: num [1:8] 47 54 49 14 41 46 77 13
##   .. ..$ edge: num [1:7, 1:3] 54 47 54 47 49 49 49 49 54 46 ...
##   .. ..$ ssw : num 1072
##   ..$ :List of 3
##   .. ..$ node: num [1:3] 53 42 48
##   .. ..$ edge: num [1:2, 1:3] 53 53 42 48 71.4 ...
##   .. ..$ ssw : num 223
##   ..$ :List of 3
##   .. ..$ node: num [1:2] 95 58
##   .. ..$ edge: num [1, 1:3] 95 58 0
##   .. ..$ ssw : num 0
##  $ not.prune   : NULL
##  $ candidates  : int [1:10] 1 2 3 4 5 6 7 8 9 10
##  $ ssto        : num 42810
##  $ ssw         : num [1:10] 42810 40806 39007 37938 36904 ...
##  $ crit        : num [1:2] 1 Inf
##  $ vec.crit    : num [1:309] 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "class")= chr "skater"
clustergrps <- skaterclust$groups
table(clustergrps)
## clustergrps
##   1   2   3   4   5   6   7   8   9  10 
## 268  12  12   1   1   1   1   8   3   2
plot(mpsz_sp, border=gray(.5))
plot(skaterclust, coordinates(mpsz_3414_sp), cex.lab=.7,
     groups.colors=c("red","green","blue", "brown", "pink"), cex.circles=0.005, add=TRUE)
## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

## Warning in segments(coords[id1, 1], coords[id1, 2], coords[id2, 1],
## coords[id2, : "add" is not a graphical parameter

groups_mat <- as.matrix(skaterclust$groups)
sg_biz_cluster <- sg_biz_cluster %>%
  filter(!SUBZONE_N %in% c("SUDONG", "SEMAKAU"))
sg_biz_spatialcluster <- cbind(sg_biz_cluster, as.factor(groups_mat)) %>%
  rename(`SP_CLUSTER`=`as.factor.groups_mat.`)
qtm(sg_biz_spatialcluster, "SP_CLUSTER")